This tutorial uses fake data developed through the synthpop package in R.
library(tidyverse) # working with datasets
library(specr) # specification curves
library(mice) # multiple imputation
library(semTools) # missing data information
df = read.csv("crea_data.csv") # loading dataset
head(df) # checking data
## SubID Group Age Schoolgrade Gender Race Ethnicity Income
## 1 PA001_V1 DA 7.381246 1 0 5 1 1.8307418
## 2 PA002_V1 C 8.427105 3 1 4 0 7.8539171
## 3 PA003_V1 PI 8.588638 4 0 2 0 12.3572735
## 4 PA007_V1 DC 7.411362 2 0 5 0 0.3388797
## 5 PA008_V1 DA 10.105407 5 1 6 0 1.8353522
## 6 PA009_V1 C 11.904175 6 0 6 1 2.8364534
## Care_Edu Reporter_relation Attachment_security_8 Attachment_security_4
## 1 19 1 0.67261422 1.0591693
## 2 16 1 1.04733955 0.9418006
## 3 18 1 NA 0.1202198
## 4 14 0 1.15440393 1.0591693
## 5 19 1 1.31500049 1.1765380
## 6 20 1 -0.02330423 -0.2318863
## PBS CBCL WIAT_W WIAT_P WIAT_N WASI_VOCAB
## 1 0.6079087 2.8662440 NA NA NA -2.16061326
## 2 -0.4036524 -0.5215826 NA NA NA -0.09780554
## 3 0.3345138 -0.5215826 NA NA NA -0.44160683
## 4 NA -0.8980078 -0.2570949 -0.8328702 -0.9655284 -0.52755715
## 5 -1.3878740 -0.8980078 0.6773165 0.8972241 1.2011991 0.24599575
## 6 1.8381858 -0.5215826 0.1517101 0.3411224 -0.3282556 -0.18375586
## WASI_MATRIX crEA
## 1 1.5240056 1
## 2 0.1809921 0
## 3 -0.1547613 1
## 4 1.5240056 1
## 5 -0.7423298 1
## 6 -0.1547613 0
# correcting variable types (making sure categorical variables are coded as factors)
df = df %>% mutate(
Gender = as.factor(Gender),
Race = as.factor(Race),
Ethnicity = as.factor(Ethnicity),
)
md.pattern(df, rotate.names = TRUE) # pattern of missing data
## SubID Group Age Schoolgrade Gender Race Ethnicity crEA
## 157 1 1 1 1 1 1 1 1
## 55 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 40 1 1 1 1 1 1 1 1
## 12 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0
## Attachment_security_4 WASI_MATRIX WASI_VOCAB Attachment_security_8 Income
## 157 1 1 1 1 1
## 55 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 40 1 1 1 1 1
## 12 1 1 1 1 1
## 9 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 4 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 2 1 1 1 1 0
## 1 1 1 1 1 0
## 1 1 1 1 1 0
## 5 1 1 1 0 1
## 1 1 1 1 0 1
## 2 1 1 1 0 1
## 1 1 1 1 0 1
## 3 1 1 0 1 1
## 1 1 1 0 1 1
## 1 1 1 0 1 1
## 3 1 0 1 1 1
## 1 1 0 1 1 1
## 2 0 1 1 1 1
## 1 0 1 0 1 1
## 3 4 6 9 12
## Care_Edu Reporter_relation CBCL WIAT_N WIAT_W WIAT_P PBS
## 157 1 1 1 1 1 1 1 0
## 55 1 1 1 1 1 1 0 1
## 1 1 1 1 1 0 0 1 2
## 2 1 1 1 1 0 0 0 3
## 1 1 1 1 0 1 1 1 1
## 2 1 1 1 0 1 0 1 2
## 40 1 1 1 0 0 0 1 3
## 12 1 1 1 0 0 0 0 4
## 9 1 1 0 1 1 1 1 1
## 1 1 1 0 1 1 1 0 2
## 2 1 1 0 0 0 0 1 4
## 4 1 0 1 1 1 1 1 1
## 2 1 0 1 1 1 1 0 2
## 1 1 0 1 0 0 0 1 4
## 1 0 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 0 2
## 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0 2
## 1 0 1 1 1 1 1 1 2
## 1 0 1 1 1 1 1 0 3
## 1 0 1 1 0 0 0 1 5
## 1 0 0 1 1 1 1 1 3
## 1 0 0 1 1 1 1 0 4
## 1 0 0 1 0 0 0 0 7
## 2 0 0 0 1 1 1 1 4
## 1 0 0 0 1 1 1 0 5
## 1 0 0 0 1 0 0 0 7
## 5 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0 2
## 2 1 1 1 0 0 0 1 4
## 1 1 1 0 1 1 1 0 3
## 3 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 0 4
## 1 1 1 1 0 0 0 1 4
## 3 1 1 1 1 1 1 1 1
## 1 1 1 1 0 0 0 0 5
## 2 1 1 1 1 1 1 0 2
## 1 1 1 1 1 1 1 1 2
## 12 14 17 64 66 68 85 360
mean(is.na(df$PBS)) # checking percentage of missing data
## [1] 0.2623457
mean(is.na(df$WIAT_N))
## [1] 0.1975309
ini = mice(df, seed = 329, print = F, m = 5) # dry run of imputations to see how it works
summary(ini) # summary of imputed object
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## SubID Group Age
## "" "" ""
## Schoolgrade Gender Race
## "" "" ""
## Ethnicity Income Care_Edu
## "" "pmm" "pmm"
## Reporter_relation Attachment_security_8 Attachment_security_4
## "pmm" "pmm" "pmm"
## PBS CBCL WIAT_W
## "pmm" "pmm" "pmm"
## WIAT_P WIAT_N WASI_VOCAB
## "pmm" "pmm" "pmm"
## WASI_MATRIX crEA
## "pmm" ""
## PredictorMatrix:
## SubID Group Age Schoolgrade Gender Race Ethnicity Income Care_Edu
## SubID 0 0 1 1 1 1 1 1 1
## Group 0 0 1 1 1 1 1 1 1
## Age 0 0 0 1 1 1 1 1 1
## Schoolgrade 0 0 1 0 1 1 1 1 1
## Gender 0 0 1 1 0 1 1 1 1
## Race 0 0 1 1 1 0 1 1 1
## Reporter_relation Attachment_security_8 Attachment_security_4 PBS
## SubID 1 1 1 1
## Group 1 1 1 1
## Age 1 1 1 1
## Schoolgrade 1 1 1 1
## Gender 1 1 1 1
## Race 1 1 1 1
## CBCL WIAT_W WIAT_P WIAT_N WASI_VOCAB WASI_MATRIX crEA
## SubID 1 1 1 1 1 1 1
## Group 1 1 1 1 1 1 1
## Age 1 1 1 1 1 1 1
## Schoolgrade 1 1 1 1 1 1 1
## Gender 1 1 1 1 1 1 1
## Race 1 1 1 1 1 1 1
## Number of logged events: 27
## it im dep meth out
## 1 0 0 constant SubID
## 2 0 0 constant Group
## 3 1 1 PBS pmm Race5
## 4 1 2 PBS pmm Race5
## 5 1 3 PBS pmm Race5
## 6 1 4 PBS pmm Race5
fit <- with(ini, lm(CBCL ~ crEA)) # linear regression on each imputed dataset separately
summary(pool(fit)) # combining information from multiple datasets
## term estimate std.error statistic df p.value
## 1 (Intercept) -0.2767081 0.1034556 -2.674654 301.0944 7.889412e-03
## 2 crEA 0.4882505 0.1235432 3.952062 290.1153 9.738233e-05
## Example for how to check for variables related to missigness. For the sake of today's workshop, we will include all variables in the imputation model (better to err on the side of more!).
df = df %>% mutate (pbs_missing = is.na(df$PBS)) # create variable which specifies missingness
t.test(df$Age ~ df$pbs_missing) # identifying continuous variables related to missingness
##
## Welch Two Sample t-test
##
## data: df$Age by df$pbs_missing
## t = -0.14344, df = 133.1, p-value = 0.8862
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -0.6098686 0.5273964
## sample estimates:
## mean in group FALSE mean in group TRUE
## 9.297283 9.338519
summary(glm(data = df, formula = pbs_missing ~ Gender, family = binomial(link='logit'))) # identifying categorical variables related to missingness
##
## Call:
## glm(formula = pbs_missing ~ Gender, family = binomial(link = "logit"),
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9110 -0.9110 -0.6451 1.4696 1.8286
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4639 0.1993 -7.345 2.06e-13 ***
## Gender1 0.7989 0.2603 3.069 0.00215 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 372.92 on 323 degrees of freedom
## Residual deviance: 363.20 on 322 degrees of freedom
## AIC: 367.2
##
## Number of Fisher Scoring iterations: 4
pred = ini$predictorMatrix # which variables predicted what - can be useful when you don't want certain variables to be predictors. MICE doesn't allow you to remove a variable from being predicted unless you also want to remove it from being a predictor (you will have to create your own prediction matrix for that)
## 1 represents that column variable was used to predict the row variable
pred[ , c('Attachment_security_4')] <- 0 # remove from predicting all variables
pred['PBS', 'Age'] <- 0 # remove specific predictors for specific outcomes
# percentage of missing data: minimum number of datasets to impute
mean(is.na(df$PBS))
## [1] 0.2623457
# fraction of missing information (FMI/m should be less than 0.01)
fmi(df, method = "saturated", group = NULL, ords = NULL,
varnames = NULL, exclude = NULL, fewImps = FALSE)
## $Covariances
## $Covariances$coef
## Age Schlgr Income Car_Ed Rprtr_ Att__8 Att__4 PBS
## Age 4.563
## Schoolgrade 4.375 4.440
## Income -0.276 -0.336 19.778
## Care_Edu -0.185 -0.260 3.675 9.176
## Reporter_relation 0.066 0.071 -0.106 -0.257 0.169
## Attachment_security_8 0.049 0.065 -0.214 0.094 -0.025 0.914
## Attachment_security_4 0.068 0.073 -0.193 0.069 -0.018 0.905 0.929
## PBS 0.140 0.081 0.425 0.477 -0.092 0.009 0.014 1.016
## CBCL -0.247 -0.285 0.003 0.093 -0.015 -0.084 -0.082 -0.212
## WIAT_W 0.054 0.081 0.510 0.167 -0.034 0.058 0.029 0.071
## WIAT_P 0.094 0.159 0.394 0.151 0.022 0.120 0.099 0.019
## WIAT_N -0.139 -0.098 0.430 0.679 -0.025 -0.029 -0.061 0.005
## WASI_VOCAB -0.101 -0.101 1.387 0.169 -0.052 -0.008 -0.020 0.158
## WASI_MATRIX 0.016 0.021 0.366 0.330 -0.009 -0.111 -0.114 0.092
## crEA 0.079 0.069 -0.042 0.127 0.019 -0.006 -0.003 -0.072
## CBCL WIAT_W WIAT_P WIAT_N WASI_V WASI_M crEA
## Age
## Schoolgrade
## Income
## Care_Edu
## Reporter_relation
## Attachment_security_8
## Attachment_security_4
## PBS
## CBCL 1.051
## WIAT_W -0.189 1.022
## WIAT_P -0.142 0.750 0.927
## WIAT_N -0.161 0.624 0.482 1.056
## WASI_VOCAB -0.020 0.465 0.395 0.411 0.945
## WASI_MATRIX -0.113 0.507 0.433 0.462 0.271 0.989
## crEA 0.105 -0.133 -0.077 -0.064 -0.056 -0.077 0.207
##
## $Covariances$fmi
## Age Schlgr Income Car_Ed Rprtr_ Att__8 Att__4 PBS
## Age 0.000
## Schoolgrade 0.000 0.000
## Income 0.054 0.062 0.038
## Care_Edu 0.062 0.067 0.036 0.032
## Reporter_relation 0.053 0.059 0.031 0.047 0.046
## Attachment_security_8 0.001 0.001 0.018 0.018 0.046 0.000
## Attachment_security_4 0.000 0.000 0.016 0.019 0.058 0.000 0.001
## PBS 0.280 0.292 0.143 0.299 0.379 0.253 0.264 0.255
## CBCL 0.052 0.059 0.051 0.088 0.100 0.036 0.036 0.221
## WIAT_W 0.174 0.163 0.253 0.145 0.194 0.133 0.154 0.355
## WIAT_P 0.195 0.180 0.269 0.154 0.194 0.149 0.172 0.340
## WIAT_N 0.168 0.158 0.265 0.150 0.181 0.140 0.162 0.295
## WASI_VOCAB 0.011 0.009 0.029 0.036 0.045 0.014 0.012 0.250
## WASI_MATRIX 0.019 0.016 0.109 0.036 0.056 0.006 0.005 0.182
## crEA 0.000 0.000 0.020 0.015 0.022 0.001 0.000 0.211
## CBCL WIAT_W WIAT_P WIAT_N WASI_V WASI_M crEA
## Age
## Schoolgrade
## Income
## Care_Edu
## Reporter_relation
## Attachment_security_8
## Attachment_security_4
## PBS
## CBCL 0.054
## WIAT_W 0.155 0.220
## WIAT_P 0.166 0.215 0.220
## WIAT_N 0.191 0.214 0.212 0.230
## WASI_VOCAB 0.058 0.160 0.177 0.166 0.019
## WASI_MATRIX 0.052 0.143 0.161 0.151 0.028 0.016
## crEA 0.045 0.219 0.253 0.242 0.010 0.013 0.000
##
##
## $Means
## variable group coef fmi
## 121 Age 1 9.308 0.000
## 122 Schoolgrade 1 3.563 0.000
## 123 Income 1 4.434 0.034
## 124 Care_Edu 1 16.113 0.035
## 125 Reporter_relation 1 0.809 0.042
## 126 Attachment_security_8 1 0.027 0.001
## 127 Attachment_security_4 1 0.020 0.000
## 128 PBS 1 0.049 0.259
## 129 CBCL 1 0.077 0.048
## 130 WIAT_W 1 0.187 0.181
## 131 WIAT_P 1 0.139 0.202
## 132 WIAT_N 1 0.105 0.184
## 133 WASI_VOCAB 1 0.036 0.013
## 134 WASI_MATRIX 1 -0.051 0.009
## 135 crEA 1 0.707 0.000
imp_mypred <- mice(df, predictorMatrix = pred, seed = 329, print=F, maxit = 10, m = 10) # using our own prediction matrix -- change maxit and mice to increase number of iterations and number of imputed data sets respectively
imp_quickpred = mice(df, pred=quickpred(df, mincor=.3), seed = 329, print=F) # the second option is to use pre-set criteria for which variables shouldbe used to predict
plot(imp_mypred) # check convergence of data sets - want them to be free of any trends in later iterations
stripplot(imp_mypred, CBCL~.imp, pch = 20, cex = 2) # check reasonability of imputed values
summary(with(imp_mypred, mean(PBS))) ## mean from imputed data set
## # A tibble: 10 × 1
## x
## <dbl>
## 1 -0.000395
## 2 0.0575
## 3 0.0610
## 4 0.0287
## 5 0.0562
## 6 0.0149
## 7 0.0452
## 8 0.0806
## 9 -0.0368
## 10 0.0250
summary(mean(df$PBS, na.rm = TRUE)) ## mean from non-imputed dataset
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03652 0.03652 0.03652 0.03652 0.03652 0.03652
# First - save the imputed data so that you don't have to do it again and again
save(imp_mypred, file = "impdata.rda")
# Load the data everytime you want to use it
load("impdata.rda")
# For specification curves - we will use the long form of the data
c.long = complete(imp_mypred, 'long') # convert imputed data into long form dataframe
c.broad = complete(imp_mypred, 'broad') # convert imputed data into broad form dataframe
## New names:
## * SubID -> SubID...1
## * Group -> Group...2
## * Age -> Age...3
## * Schoolgrade -> Schoolgrade...4
## * Gender -> Gender...5
## * ...
spec_nonimp = run_specs(df = df,
x = c("crEA"),
y = c("CBCL"),
controls = c("Age", "Gender", "Race", "Ethnicity"),
model = c("lm"),
all.comb = TRUE)
plot_specs(spec_nonimp) # visualize
# running specification curve
spec_imp <- c.long %>%
group_by(.imp) %>% # group data based on which imputed data set is
nest() %>% # creates data frames within data frames
mutate(specs = purrr::map(data, ~run_specs(df = .,
x = c("crEA"),
y = c("CBCL"),
controls = c("Age", "Gender", "Race", "Ethnicity"),
model = c("lm"),
all.comb = TRUE))) # create a separate specification curve for each imputed data set
# unnest specification curves
unnested_specs <- spec_imp %>% unnest(specs) # 'takes out' the specifications from the dataframe
# Function for pooling standard deviation across the imputations
pooled_sd = function(means, sds){
n = length(means)
variance_within = (1/n)*sum(sds^2)
variance_between = var(means)
variance_total = variance_within + variance_between + (variance_between/n)
return(sqrt(variance_total))
}
#### grouping and getting mean estimate (rubin's rules)
mean_estimate <- unnested_specs %>%
group_by(controls, y, x, model, subsets) %>%
summarize(std.error = pooled_sd(means = estimate,
sds = std.error),
estimate = mean(estimate)
) %>%
mutate(conf.low = estimate-2*std.error,
conf.high = estimate+2*std.error,
statistic = estimate/std.error) %>%
ungroup()
## `summarise()` has grouped output by 'controls', 'y', 'x', 'model'. You can override using the `.groups` argument.
plot_specs(mean_estimate) # visualize
## running specification curve
spec_imp_attach <- c.long %>%
group_by(.imp) %>% # group data based on which imputed data set is
nest() %>% # creates data frames within data frames
mutate(specs = purrr::map(data, ~run_specs(df = .,
x = c("Attachment_security_4", "Attachment_security_8"),
y = c("CBCL"),
controls = c("Age", "Gender", "Race", "Ethnicity"),
model = c("lm"),
all.comb = TRUE))) # create a separate specification curve for each imputed data set
#### unnest specification curves
unnested_specs_attach <- spec_imp_attach %>% unnest(specs) # 'takes out' the specifications from the dataframe
#### grouping and getting mean estimate (rubin's rules)
mean_estimate_attach <- unnested_specs_attach %>%
group_by(controls, y, x, model, subsets) %>%
summarize(std.error = pooled_sd(means = estimate,
sds = std.error),
estimate = mean(estimate)
) %>%
mutate(conf.low = estimate-2*std.error,
conf.high = estimate+2*std.error,
statistic = estimate/std.error) %>%
ungroup()
## `summarise()` has grouped output by 'controls', 'y', 'x', 'model'. You can override using the `.groups` argument.
plot_specs(mean_estimate_attach) # visualize